-
Notifications
You must be signed in to change notification settings - Fork 14.5k
ELF: CFI jump table relaxation. #147424
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/pcc/spr/main.elf-cfi-jump-table-relaxation
Are you sure you want to change the base?
ELF: CFI jump table relaxation. #147424
Conversation
Created using spr 1.3.6-beta.1
@llvm/pr-subscribers-lld-elf @llvm/pr-subscribers-lld Author: Peter Collingbourne (pcc) Changes: Indirection via the jump table increases the icache and TLB miss rate associated with indirect calls. The basic idea is to eliminate the indirection by moving function bodies into the jump table wherever possible.
In both cases, we may move the function body into the jump table by splitting the jump table in two, with enough space in the middle for the function body. We leave the last function in the jump table at its original location. Jump table relaxation was found to reduce the overhead of CFI in a large realistic internal benchmark. TODO: The jump table relaxation optimization as implemented is not sound in… [truncated in this excerpt]
Of course, once we've decided on the appropriate way to identify the jump tables… This implementation is for X86_64 only. I considered whether it would be… [truncated in this excerpt] [1] In this prototype implementation, I made these additional assumptions:
I think it should be possible to find a way to avoid making these assumptions. Full diff: https://github.com/llvm/llvm-project/pull/147424.diff — 3 Files Affected:
diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp
index 488f4803b2cb4..04ca79befdc4a 100644
--- a/lld/ELF/Arch/X86_64.cpp
+++ b/lld/ELF/Arch/X86_64.cpp
@@ -318,6 +318,9 @@ bool X86_64::deleteFallThruJmpInsn(InputSection &is, InputFile *file,
}
bool X86_64::relaxOnce(int pass) const {
+ if (pass == 0)
+ relaxJumpTables(ctx);
+
uint64_t minVA = UINT64_MAX, maxVA = 0;
for (OutputSection *osec : ctx.outputSections) {
if (!(osec->flags & SHF_ALLOC))
@@ -1231,6 +1234,98 @@ void X86_64::applyBranchToBranchOpt() const {
redirectControlTransferRelocations);
}
+void elf::relaxJumpTables(Ctx &ctx) {
+ // Relax CFI jump tables.
+ // - Split jump table into pieces and place target functions inside the jump
+ // table if small enough.
+ // - Move jump table before last called function and delete last branch
+ // instruction.
+ std::map<InputSection *, std::vector<InputSection *>> sectionReplacements;
+ SmallVector<InputSection *, 0> storage;
+ for (OutputSection *osec : ctx.outputSections) {
+ if (!(osec->flags & SHF_EXECINSTR))
+ continue;
+ for (InputSection *sec : getInputSections(*osec, storage)) {
+ if (!sec->name.starts_with(".text..L.cfi.jumptable"))
+ continue;
+ std::vector<InputSection *> replacements;
+ replacements.push_back(sec);
+ auto addSectionSlice = [&](size_t begin, size_t end, Relocation *rbegin,
+ Relocation *rend) {
+ if (begin == end)
+ return;
+ auto *slice = make<InputSection>(
+ sec->file, sec->name, sec->type, sec->flags, 1, sec->entsize,
+ sec->contentMaybeDecompress().slice(begin, end - begin));
+ for (const Relocation &r : ArrayRef<Relocation>(rbegin, rend)) {
+ slice->relocations.push_back(
+ Relocation{r.expr, r.type, r.offset - begin, r.addend, r.sym});
+ }
+ replacements.push_back(slice);
+ };
+ auto getMovableSection = [&](Relocation &r) -> InputSection * {
+ auto *sym = dyn_cast_or_null<Defined>(r.sym);
+ if (!sym || sym->isPreemptible || sym->isGnuIFunc() || sym->value != 0)
+ return nullptr;
+ auto *sec = dyn_cast_or_null<InputSection>(sym->section);
+ if (!sec || sectionReplacements.count(sec))
+ return nullptr;
+ return sec;
+ };
+ size_t begin = 0;
+ Relocation *rbegin = sec->relocs().begin();
+ for (auto &r : sec->relocs().slice(0, sec->relocs().size() - 1)) {
+ auto entrySize = (&r + 1)->offset - r.offset;
+ InputSection *target = getMovableSection(r);
+ if (!target || target->size > entrySize)
+ continue;
+ target->addralign = 1;
+ addSectionSlice(begin, r.offset - 1, rbegin, &r);
+ replacements.push_back(target);
+ sectionReplacements[target] = {};
+ begin = r.offset - 1 + target->size;
+ rbegin = &r + 1;
+ }
+ InputSection *lastSec = getMovableSection(sec->relocs().back());
+ if (lastSec) {
+ lastSec->addralign = 1;
+ addSectionSlice(begin, sec->relocs().back().offset - 1, rbegin,
+ &sec->relocs().back());
+ replacements.push_back(lastSec);
+ sectionReplacements[sec] = {};
+ sectionReplacements[lastSec] = replacements;
+ for (auto *s : replacements)
+ s->parent = lastSec->parent;
+ } else {
+ addSectionSlice(begin, sec->size, rbegin, sec->relocs().end());
+ sectionReplacements[sec] = replacements;
+ for (auto *s : replacements)
+ s->parent = sec->parent;
+ }
+ sec->relocations.clear();
+ sec->size = 0;
+ }
+ }
+ for (OutputSection *osec : ctx.outputSections) {
+ if (!(osec->flags & SHF_EXECINSTR))
+ continue;
+ for (SectionCommand *cmd : osec->commands) {
+ auto *isd = dyn_cast<InputSectionDescription>(cmd);
+ if (!isd)
+ continue;
+ SmallVector<InputSection *> newSections;
+ for (auto *sec : isd->sections) {
+ auto i = sectionReplacements.find(sec);
+ if (i == sectionReplacements.end())
+ newSections.push_back(sec);
+ else
+ newSections.append(i->second.begin(), i->second.end());
+ }
+ isd->sections = std::move(newSections);
+ }
+ }
+}
+
// If Intel Indirect Branch Tracking is enabled, we have to emit special PLT
// entries containing endbr64 instructions. A PLT entry will be split into two
// parts, one in .plt.sec (writePlt), and the other in .plt (writeIBTPlt).
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index cebd564036b2c..f7e3d54878395 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -1674,7 +1674,7 @@ void RelocationScanner::scan(Relocs<RelTy> rels) {
// R_RISCV_PCREL_HI20, R_PPC64_ADDR64 and the branch-to-branch optimization.
if (ctx.arg.emachine == EM_RISCV ||
(ctx.arg.emachine == EM_PPC64 && sec->name == ".toc") ||
- ctx.arg.branchToBranch)
+ ctx.arg.branchToBranch || sec->name.starts_with(".text..L.cfi.jumptable"))
llvm::stable_sort(sec->relocs(),
[](const Relocation &lhs, const Relocation &rhs) {
return lhs.offset < rhs.offset;
diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h
index 6dd20b2f0cbaa..e6eb33fa5338c 100644
--- a/lld/ELF/Target.h
+++ b/lld/ELF/Target.h
@@ -195,6 +195,7 @@ void setSPARCV9TargetInfo(Ctx &);
void setSystemZTargetInfo(Ctx &);
void setX86TargetInfo(Ctx &);
void setX86_64TargetInfo(Ctx &);
+void relaxJumpTables(Ctx &);
struct ErrorPlace {
InputSectionBase *isec;
|
There is no accompanying test, making its transformations unclear. Relying on a "magic" section name to trigger transformations feels unreliable and imprecise. Introducing a new section type and relocation type might justify the feature. |
Apologies if the description wasn't enough. My intent was to write a test once we agree on the protocol between the compiler and the linker. I'll try to illustrate my goal with some assembly:
I want the linker to do this:
Agreed. I used a magic section name for expediency so that I'd have something to share in this PR. So do you think that both a new section type and a new relocation type are needed? I was thinking that either/or would be enough. |
…ple PRs Created using spr 1.3.6-beta.1
Okay, I've added the section type (#149259), made this use it, polished the implementation and added a test. |
You can test this locally with the following command: git-clang-format --diff HEAD~1 HEAD --extensions h,cpp -- lld/ELF/Arch/X86_64.cpp lld/ELF/OutputSections.cpp lld/ELF/Relocations.cpp lld/ELF/Target.h lld/ELF/Writer.cpp View the diff from clang-format here: diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp
index dc3cb4ee9..bacc7f758 100644
--- a/lld/ELF/Arch/X86_64.cpp
+++ b/lld/ELF/Arch/X86_64.cpp
@@ -401,9 +401,9 @@ void X86_64::relaxCFIJumpTables() const {
targetOutputSec = sec->getParent();
}
- // Walk the jump table entries other than the last one looking for sections
- // that are small enough to be moved into the jump table and in the same
- // section as the jump table's destination.
+ // Walk the jump table entries other than the last one looking for
+ // sections that are small enough to be moved into the jump table and in
+ // the same section as the jump table's destination.
size_t begin = 0;
Relocation *rbegin = sec->relocs().begin();
size_t cur = begin;
|
Created using spr 1.3.6-beta.1
Indirection via the jump table increases the icache and TLB miss rate
associated with indirect calls, and according to internal benchmarking
was identified as one of the main runtime costs of CFI, contributing
around 30% of the total overhead. #145579 addressed the problem for
direct calls to jump table entries, but the indirect call overhead is
still present. This patch implements jump table relaxation, which is a
technique for opportunistically reducing the indirect call overhead.
The basic idea is to eliminate the indirection by moving function
bodies into the jump table wherever possible. This is possible in two
circumstances:
In both cases, we may move the function body into the jump table by
splitting the jump table in two, with enough space in the middle for the
function body, and placing the function there.
We leave the last function in the jump table at its original location
and place the rest of the jump table behind it. The goal of this is to
decrease the TLB miss rate, on the assumption that it is more likely
for functions with the same type (and their callees) to be in the same
page as each other than for them to be in the same page as the original
location of the jump table (typically clustered together near the end
of the binary).
A complete implementation of jump table relaxation was found to reduce
the overhead of CFI in a large realistic internal Google benchmark
by between 0.2 and 0.5 percentage points, or 10-25%, depending on the
microarchitecture.